import pymongo
import requests
from bs4 import BeautifulSoup
import time

# CSS selector of the first listing link on a search-result page:
#   #page_list > ul > li:nth-child(1) > a

# Search-result pages to crawl: range(1, 3) yields page numbers 1 and 2.
page_urls = [
    'http://bj.xiaozhu.com/search-duanzufang-p{}-0/'.format(page_no)
    for page_no in range(1, 3)
]

# HTTP headers sent with every request. The Cookie value is split across
# adjacent literals purely for readability; Python joins them at compile time
# into a single string.
# NOTE(review): the cookie looks like a captured browser session and will
# presumably go stale — confirm whether the site actually requires it.
headers = {
    'Cookie': (
        'abtest_ABTest4SearchDate=b; '
        '_ga=GA1.2.1690105385.1516269802; '
        '_gid=GA1.2.253349146.1516269802; '
        'gr_user_id=b3adc427-e44a-4586-ba33-eb42b5b10c2d; '
        '__utmz=29082403.1516269803.1.1.utmcsr=(direct)|utmccn=(direct)|utmcmd=(none); '
        'xzuuid=d5cc1d02; '
        '_gat_UA-33763849-7=1; '
        'gr_session_id_59a81cc7d8c04307ba183d331c373ef6=b24a890a-e885-4f4c-ad93-f3cb3cadc81a; '
        '__utma=29082403.1690105385.1516269802.1516327198.1516339152.3; '
        '__utmc=29082403; '
        '__utmt=1; '
        '__utmb=29082403.1.10.1516339152'
    ),
    'User-Agent': (
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
        'AppleWebKit/537.36 (KHTML, like Gecko) '
        'Chrome/63.0.3239.132 Safari/537.36'
    ),
    'Host': 'bj.xiaozhu.com',
}


def get_page_hrefs(url, headers, timeout=10):
    """Return the listing anchors found on one search-result page.

    Args:
        url: Address of the search-result page to download.
        headers: Mapping of HTTP headers sent with the request.
        timeout: Seconds to wait for the server before giving up. Without
            a timeout, ``requests`` waits indefinitely on a stalled
            connection, which would freeze the whole crawl.

    Returns:
        The elements matched by ``#page_list > ul > li > a`` (a list-like
        result of BeautifulSoup tags, empty when nothing matches). Callers
        read the link target with ``tag.get('href')``.

    Raises:
        requests.exceptions.RequestException: If the request fails or
            times out.
    """
    response = requests.get(url, headers=headers, timeout=timeout)
    soup = BeautifulSoup(response.text, 'lxml')
    return soup.select("#page_list > ul > li > a")


# Sample listing-detail URL. Nothing in the visible code reads this name;
# presumably kept for trying get_house_data() by hand — confirm before removing.
href = 'http://bj.xiaozhu.com/fangzi/2803985763.html'


def get_house_data(url, headers, timeout=10):
    """Download one listing page and extract its details.

    Args:
        url: Address of the listing-detail page.
        headers: Mapping of HTTP headers sent with the request.
        timeout: Seconds to wait for the server before giving up. Without
            a timeout, ``requests`` waits indefinitely on a stalled
            connection.

    Returns:
        A list of dicts with the keys ``title``, ``address``, ``price``,
        ``housePic``, ``ownerPic``, ``ownerSex`` and ``ownerName``. The
        per-field matches are combined with ``zip``, so a record is only
        produced for positions where every selector matched; if any
        selector matches nothing the list is empty.

    Raises:
        requests.exceptions.RequestException: If the request fails or
            times out.
    """
    response = requests.get(url, headers=headers, timeout=timeout)
    soup = BeautifulSoup(response.text, 'lxml')

    # Scraped fields: title, address, daily rent, first listing photo,
    # host avatar URL, host gender, host name.
    titles = soup.select('div.pho_info > h4 > em')
    addresses = soup.select('div.pho_info > p > span')
    prices = soup.select('#pricePart > div.day_l > span')
    house_pics = soup.select('#curBigImage')
    owner_pics = soup.select('div.member_pic > a > img')
    owner_sexes = soup.select('div.w_240 > h6 > span')
    owner_names = soup.select('div.w_240 > h6 > a')

    data = []
    for title, address, price, house_pic, owner_pic, owner_sex, owner_name in zip(
            titles, addresses, prices, house_pics, owner_pics, owner_sexes,
            owner_names):
        # A tag without a class attribute yields None from .get('class');
        # fall back to an empty list so the lookup below cannot raise.
        sex_classes = owner_sex.get('class') or []
        info = {
            'title': title.get_text(),
            # Keep only the text before the first space. partition() returns
            # the whole string when there is no space, whereas slicing with
            # find()'s -1 result would silently drop the final character.
            'address': address.get_text().partition(' ')[0],
            'price': price.get_text(),
            'housePic': house_pic.get('src'),
            'ownerPic': owner_pic.get('src'),
            # Membership test: the order of CSS classes carries no meaning.
            'ownerSex': 'female' if 'member_girl_ico' in sex_classes else 'male',
            'ownerName': owner_name.get_text()
        }
        data.append(info)
    return data


def xiaozhu_run(page_urls, headers):
    """Crawl each search page and every listing it links to.

    Args:
        page_urls: Iterable of search-result page addresses.
        headers: HTTP headers forwarded to every request.

    Returns:
        One flat list holding the records returned by ``get_house_data``
        for every listing, in crawl order. Empty when ``page_urls`` is
        empty.
    """
    collected = []
    for search_url in page_urls:
        # Pause for two seconds before each search-page request.
        time.sleep(2)
        for anchor in get_page_hrefs(search_url, headers):
            # Pause again before each listing-detail request.
            time.sleep(2)
            collected += get_house_data(anchor.get('href'), headers)
    return collected


# Connect to the MongoDB server and pick the target database and collection.
client = pymongo.MongoClient(host='192.168.107.128', port=27017)
xiaozhu = client.get_database('xiaozhu')
bj_duanzhufang = xiaozhu.get_collection('bj_duanzhufang')

# Crawl everything first; only then echo and store the records one by one.
datas = xiaozhu_run(page_urls, headers)
for data in datas:
    # Print before inserting: insert_one() adds an '_id' key to the dict.
    print(data)
    bj_duanzhufang.insert_one(data)
